{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json)); using driver found in the cache\n",
      "Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Cookies 加载成功！\n",
      "✅ 已使用 Cookies 登录，无需手动输入密码！\n",
      "🎉 已成功进入 Pixiv！\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.action_chains import ActionChains\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "import requests\n",
    "import pickle\n",
    "import time\n",
    "import bs4\n",
    "import os\n",
    "\n",
    "# Browser setup\n",
    "options = Options()\n",
    "options.headless = False  # keep headless off to reduce bot detection\n",
    "driver = webdriver.Chrome(options=options)\n",
    "\n",
    "# Pixiv home page\n",
    "pixiv_url = \"https://www.pixiv.net/\"\n",
    "\n",
    "# **1. Load cookies**\n",
    "def load_cookies():\n",
    "    \"\"\"Load saved cookies from cookies.pkl into the running driver.\n",
    "\n",
    "    Returns True if the cookie file existed and was applied, False otherwise.\n",
    "    NOTE(review): pickle.load on an untrusted file can execute arbitrary\n",
    "    code -- only load cookies.pkl files this script wrote itself.\n",
    "    \"\"\"\n",
    "    if os.path.exists(\"cookies.pkl\"):\n",
    "        driver.get(pixiv_url)\n",
    "        time.sleep(3)  # wait for the page to load\n",
    "        with open(\"cookies.pkl\", \"rb\") as f:\n",
    "            cookies = pickle.load(f)\n",
    "            for cookie in cookies:\n",
    "                driver.add_cookie(cookie)\n",
    "        print(\"✅ Cookies 加载成功！\")\n",
    "        driver.refresh()  # refresh so the cookies take effect\n",
    "        time.sleep(5)\n",
    "        return True\n",
    "    return False\n",
    "\n",
    "# **2. Log in manually and save cookies**\n",
    "def login_and_save_cookies():\n",
    "    \"\"\"Open the Pixiv login page, wait for a manual login, then persist\n",
    "    the session cookies to cookies.pkl for future runs.\"\"\"\n",
    "    driver.get(\"https://accounts.pixiv.net/login\")\n",
    "    print(\"🔑 请手动登录 Pixiv...\")\n",
    "    time.sleep(30)  # give the user enough time to log in\n",
    "\n",
    "    # After logging in, save the cookies\n",
    "    cookies = driver.get_cookies()\n",
    "    print(cookies)\n",
    "    with open(\"cookies.pkl\", \"wb\") as f:\n",
    "        pickle.dump(cookies, f)\n",
    "    print(\"✅ 登录成功，Cookies 已保存！\")\n",
    "\n",
    "# **3. Main flow: reuse cookies when possible, otherwise log in**\n",
    "if not load_cookies():\n",
    "    login_and_save_cookies()\n",
    "else:\n",
    "    print(\"✅ 已使用 Cookies 登录，无需手动输入密码！\")\n",
    "\n",
    "# **4. Open the Pixiv front page with the authenticated session**\n",
    "driver.get(pixiv_url)\n",
    "time.sleep(5)\n",
    "print(\"🎉 已成功进入 Pixiv！\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "已获取第 1 页的链接\n",
      "已获取第 2 页的链接\n",
      "已获取第 3 页的链接\n",
      "已获取第 4 页的链接\n",
      "已获取第 5 页的链接\n",
      "All links have been collected\n"
     ]
    }
   ],
   "source": [
    "# Collect the artwork links of every series listing page\n",
    "page_links = []\n",
    "\n",
    "def get_pixiv_series_images(url = \"https://www.pixiv.net/user/14601125/series/217742?p=1#seriesContents\"):\n",
    "    \"\"\"Open one series listing page and append each artwork URL found\n",
    "    to the module-level page_links list, deduplicated.\n",
    "\n",
    "    url: series listing page to scrape (defaults to page 1).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        driver.get(url)\n",
    "\n",
    "        # Wait for the page to load\n",
    "        time.sleep(3)\n",
    "\n",
    "        # Scroll to trigger lazy loading of more thumbnails\n",
    "        for _ in range(2):  # adjust the number of scrolls as needed\n",
    "            driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
    "            time.sleep(2)\n",
    "\n",
    "        # Parse the rendered HTML\n",
    "        soup = bs4.BeautifulSoup(driver.page_source, \"html.parser\")\n",
    "        # NOTE(review): this generated class string is brittle and will stop\n",
    "        # matching when Pixiv redeploys its front end -- confirm it still works.\n",
    "        links = soup.find_all(\"a\", class_=\"sc-d00cb9ca-0 jmvyWD sc-5a760b36-6 bsJslY gtm-gtm-manga-series-thumbnail\")\n",
    "        for link in links:\n",
    "            href = link.get(\"href\")\n",
    "            if not href:  # skip anchors without an href\n",
    "                continue\n",
    "            # BUG FIX: compare the full URL, not the raw href. page_links\n",
    "            # stores full URLs, so the old `href not in page_links` check\n",
    "            # never matched and duplicates were always appended.\n",
    "            full_url = \"https://www.pixiv.net\" + href\n",
    "            if full_url not in page_links:\n",
    "                page_links.append(full_url)\n",
    "    except Exception as e:\n",
    "        # best-effort: report and continue with the next page\n",
    "        print(e)\n",
    "\n",
    "for i in range(1, 5 + 1):\n",
    "    url = f\"https://www.pixiv.net/user/14601125/series/217742?p={i}#seriesContents\"\n",
    "    get_pixiv_series_images(url)\n",
    "    print(f\"已获取第 {i} 页的链接\")\n",
    "    time.sleep(2)\n",
    "\n",
    "print(\"All links have been collected\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    }
   ],
   "source": [
    "def download_image(url, filename):\n",
    "    \"\"\"Download one image URL to filename, reusing the browser's Pixiv\n",
    "    session cookies. Pixiv rejects requests without the Referer header.\"\"\"\n",
    "    headers = {\n",
    "        \"Referer\": \"https://www.pixiv.net/\",\n",
    "        \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36\"\n",
    "    }\n",
    "\n",
    "    # Copy the Selenium session cookies into a requests session\n",
    "    session = requests.Session()\n",
    "    for cookie in driver.get_cookies():\n",
    "        session.cookies.set(cookie['name'], cookie['value'])\n",
    "    \n",
    "    response = session.get(url, headers=headers, stream=True)  # stream=True avoids holding large files fully in memory\n",
    "\n",
    "    if response.status_code == 200:\n",
    "        with open(filename, \"wb\") as f:\n",
    "            for chunk in response.iter_content(chunk_size=1024):  # smaller than the default 8192 chunk size\n",
    "                f.write(chunk)\n",
    "        # BUG FIX: the success message contained no placeholder, so the\n",
    "        # saved filename was never reported.\n",
    "        print(f\"✅ 下载完成: {filename}\")\n",
    "    else:\n",
    "        print(f\"❌ 下载失败: {url}, 状态码: {response.status_code}\")\n",
    "\n",
    "def get_image_links(url):\n",
    "    \"\"\"Open one artwork page, expand it to full size, and download every\n",
    "    full-resolution image it contains.\"\"\"\n",
    "    print(url)\n",
    "    driver.get(url)\n",
    "\n",
    "    # Wait for the page to load\n",
    "    time.sleep(3)\n",
    "\n",
    "    # Click the \"read works\" button to expand all pages.\n",
    "    # NOTE(review): this absolute XPath is brittle and will raise\n",
    "    # NoSuchElementException when Pixiv changes its layout -- confirm.\n",
    "    driver.find_element(By.XPATH, '//*[@id=\"root\"]/div[2]/div/div[3]/div/div/div[1]/main/section/div[1]/div/div[5]/div/div/button/div[2]').click()\n",
    "\n",
    "    # Scroll to trigger lazy loading of more images\n",
    "    for _ in range(2):  # adjust the number of scrolls as needed\n",
    "        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
    "        time.sleep(2)\n",
    "\n",
    "    # Parse the rendered HTML for the full-size image anchors\n",
    "    soup = bs4.BeautifulSoup(driver.page_source, \"html.parser\")\n",
    "    links = soup.find_all(\"a\", class_=\"gtm-expand-full-size-illust\")\n",
    "    for i, link in enumerate(links):\n",
    "        href = link.get(\"href\")\n",
    "        if not href:  # skip anchors without an href\n",
    "            continue\n",
    "        filename = href.split(\"/\")[-1]  # reuse href instead of fetching the attribute twice\n",
    "        download_image(href, filename)\n",
    "        print(f\"已经下载{i + 1}/{len(links)}张图片\")\n",
    "        time.sleep(1)\n",
    "\n",
    "    print(\"All images have been downloaded\")\n",
    "\n",
    "print(page_links)\n",
    "\n",
    "for link in page_links:\n",
    "    get_image_links(link)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
